Each record in the database describes a Boston suburb or town.
I'll evaluate the algorithms' performance using RMSE and R2 metrics.
RMSE gives a rough sense of how far off the predictions are overall (0 is perfect), and R2 indicates how well the model fits the data (1 is perfect, 0 is worst).
My best-performing model, a tuned SVM, had an RMSE of 2.81 on an unseen validation set. The second best was a tuned Cubist model with an RMSE of 2.90 on a validation set.
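For reference, both metrics are simple to compute by hand. The snippet below is only an illustrative sketch on made-up numbers, not part of the modelling pipeline; the caret and MLmetrics helpers used later compute the same quantities.
# illustrative sketch on made-up numbers: how RMSE and R2 are computed
actual    <- c(200, 150, 320, 275)
predicted <- c(210, 140, 300, 280)
sqrt(mean((actual - predicted)^2))                               # RMSE: 0 is perfect
1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2) # R2: 1 is perfect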
In [111]:
# first install packages devtools and pacman manually
#pull in source functions from github
devtools::source_url('https://raw.githubusercontent.com/jsphyg/Machine_Learning_Notebooks/master/myRfunctions.R')
#source("C:\\Work\\myRfunctions.R")
fnRunDate()
fnInstallPackages()
In [112]:
# import data changing any blanks and string NAs to NAs
dataset <- read_csv("C:\\Work\\kaggle_house_prices\\train.csv", na = c("","NA"))
test <- read_csv("C:\\Work\\kaggle_house_prices\\test.csv", na = c("","NA"))
head(dataset)
head(test)
In [113]:
Hmisc::describe(dataset, listunique=1)
In [114]:
psych::describe(dataset, check = T, skew = TRUE, ranges = TRUE, quant = TRUE)
In [115]:
fnMissingDataPercent(data = dataset)
In [116]:
# combine test and train.
dataset <- dplyr::bind_rows(dataset, test)
In [117]:
# quickly replace NAs. if numeric, replace with -1, if character replace with 'unknown'
# this gets rid of all NAs
dataset <- dataset %>% mutate_if(is.numeric, ~ ifelse(is.na(.), -1, .))
dataset <- dataset %>% mutate_if(is.character, ~ ifelse(is.na(.), 'unknown', .))
In [118]:
# combine the data to create dummy variables with caret. if data is split in train and test, i get errors
#create dummy variables
dmy <- caret::dummyVars(" ~ .", data = dataset, fullRank = T)
dataset <- as_tibble(predict(dmy, newdata = dataset))
#make the names usable in R
names(dataset) <- make.names(names(dataset), unique = TRUE)
dim(dataset)
head(test)
str(dataset)
In [119]:
# split back out the dataset and test
test <- dplyr::filter(dataset, Id > 1460)
dataset <- dplyr::filter(dataset, Id <= 1460)
#drop the test set's target column, which is full of NAs
test$SalePrice <- NULL
dim(dataset)
dim(test)
In [120]:
#set aside a final full set of data to train on before splitting a validation set
final_dataset <- dataset
# split a validation dataset
validation_index <- createDataPartition(dataset$SalePrice, p=0.80, list=FALSE)
validation <- dataset[-validation_index,]
dataset <- dataset[validation_index,]
In [121]:
dataset_slice <- dplyr::select(dataset, everything()) %>% dplyr::slice(., 1:100)
In [122]:
formula <- SalePrice ~ .
In [134]:
# Ensemble Methods
ds <- dataset_slice
# try ensembles
control <- trainControl(method="cv", number=10)
metric <- "RMSE"
# Random Forest
set.seed(9)
fit.rf <- train(formula, data=ds, method="rf", preProc=c("medianImpute"), metric=metric, trControl=control, na.action = na.pass)
# Stochastic Gradient Boosting
set.seed(9)
fit.gbm <- train(formula, data=ds, method="gbm", preProc=c("medianImpute"),metric=metric, trControl=control, verbose=FALSE, na.action = na.pass)
# Cubist
set.seed(9)
fit.cubist <- train(formula, data=ds, method="cubist", preProc=c("medianImpute"),metric=metric, trControl=control, na.action = na.pass)
# xgb
set.seed(9)
fit.xgb <- train(formula, data=ds, method="xgbTree", preProc=c("medianImpute"),metric=metric, trControl=control, na.action = na.pass)
# Compare algorithms
ensemble_results <- resamples(list(RF=fit.rf, GBM=fit.gbm, CUBIST=fit.cubist, XGB=fit.xgb))
summary(ensemble_results)
bwplot(ensemble_results)
In [135]:
fit.cubist
plot(fit.cubist)
Cubist was best. First, use a random search to find better tuning values, then tune around them with a grid search.
In [136]:
# Tune the Cubist algorithm
ds <- dplyr::select(dataset, everything()) %>% dplyr::slice(., 1:1000)
control <- trainControl(method="cv", number=10, search='random')
metric <- "RMSE"
set.seed(7)
rand.cubist <- train(formula, data=ds, method="cubist", metric=metric, trControl=control, tuneLength = 20)
print(rand.cubist)
plot(rand.cubist)
In [142]:
# Tune the Cubist algorithm
control <- trainControl(method="cv", number=5)
metric <- "RMSE"
set.seed(7)
grid <- expand.grid(.committees=seq(5, 40, by=5), .neighbors=8)
tune.cubist <- train(formula, data=dataset, method="cubist", preProc=c("zv","medianImpute","BoxCox"),metric=metric, tuneGrid=grid, trControl=control, na.action = na.pass)
print(tune.cubist)
plot(tune.cubist)
In [137]:
set.seed(13)
predictions <- predict(rand.cubist, newdata=validation, na.action=na.pass)
MLmetrics::RMSE(predictions, validation$SalePrice)
MLmetrics::RMSLE(predictions, validation$SalePrice)
In [138]:
# make predictions on the test set for Kaggle submission
test$prediction <- predict(rand.cubist, newdata = test, na.action = na.pass)
head(test)
nrow(data.frame(test))
In [139]:
my_solution <- dplyr::select(test, Id = Id, SalePrice = prediction)
my_solution$Id <- as.character(my_solution$Id)
readr::write_csv(x = data.frame(my_solution), path = "C:\\Work\\my_solution.csv")
head(my_solution, n=5)
tail(my_solution, n=5)
In [41]:
# correlation between results
modelCor(ensemble_results)
splom(ensemble_results)
Cubist was the most accurate, with the lowest RMSE. I'll tune Cubist and see if I can get more out of it.
Cubist has two parameters that are tunable with caret: committees, the number of boosting iterations, and neighbors, which is used during prediction and sets the number of training instances used to adjust the rule-based prediction (although the documentation is perhaps a little ambiguous on this).
For more information about Cubist, see the function help with ?cubist. Let's first look at the default tuning parameters used by caret that produced our accurate model.
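To make the split between the two parameters concrete, here is a minimal sketch using the Cubist package directly (illustrative values only, not a tuned model): committees is fixed when the model is fit, while neighbors is supplied at prediction time.
# illustrative sketch: committees at fit time, neighbors at predict time
library(Cubist)
x  <- dplyr::select(dataset, -SalePrice)
y  <- dataset$SalePrice
cb <- cubist(x = x, y = y, committees = 20)    # 20 boosted rule-based models
p  <- predict(cb, newdata = x, neighbors = 5)  # adjust each prediction using the 5 nearest training cases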
In [42]:
print(fit.cubist)
We can see that the best RMSE was achieved with committees = 20 and neighbors = 5. Let's use a grid search to tune further; the grid below tries all committees between 90 and 105 and spot-checks neighbors values of 8 and 9.
In [50]:
# Tune the Cubist algorithm
control <- trainControl(method="cv", number=10)
metric <- "RMSE"
set.seed(7)
grid <- expand.grid(.committees=seq(90, 105, by=1), .neighbors=seq(8,9, by=1))
tune.cubist <- train(formula, data=dataset, method="cubist", preProc=c("medianImpute"),metric=metric, tuneGrid=grid, trControl=control, na.action = na.pass)
print(tune.cubist)
plot(tune.cubist)
In [45]:
# Tune the Cubist algorithm
control <- trainControl(method="cv", number=10, search='random')
metric <- "RMSE"
set.seed(13)
rand.cubist <- train(formula, data=dataset, method="cubist", preProc=c("medianImpute"), metric=metric, trControl=control, tuneLength = 20, na.action = na.pass)
In [46]:
print(rand.cubist)
plot(rand.cubist)
In [47]:
set.seed(13)
predictions <- predict(rand.cubist, newdata=validation, na.action=na.pass)
MLmetrics::RMSE(predictions, validation$SalePrice)
MLmetrics::RMSLE(predictions, validation$SalePrice)
In [48]:
# make predictions on the test set for Kaggle submission
test$prediction <- predict(rand.cubist, newdata = test, na.action = na.pass)
head(test)
nrow(data.frame(test))
In [49]:
my_solution <- dplyr::select(test, Id = Id, SalePrice = prediction)
readr::write_csv(x = data.frame(my_solution), path = "C:\\Work\\my_solution.csv")
head(my_solution, n=20)
tail(my_solution, n=20)
In [56]:
# lm
set.seed(7)
fit.lm1 <- train(formula, data=dataset, method="lm", metric=metric, trControl=control)
# GLM
set.seed(7)
fit.glm1 <- train(formula, data=dataset, method="glm", metric=metric, trControl=control)
# GLMNET
set.seed(7)
fit.glmnet1 <- train(formula, data=dataset, method="glmnet", metric=metric, trControl=control)
# SVM
set.seed(7)
fit.svm1 <- train(formula, data=dataset, method="svmRadial", metric=metric, trControl=control)
# CART
set.seed(7)
grid <- expand.grid(.cp=c(0, 0.05, 0.1))
fit.cart1 <- train(formula, data=dataset, method="rpart", metric=metric, tuneGrid=grid, trControl=control)
# kNN
set.seed(7)
fit.knn1 <- train(formula, data=dataset, method="knn", metric=metric, trControl=control)
# Compare algorithms
results <- resamples(list(LM1=fit.lm1, GLM1=fit.glm1, GLMNET1=fit.glmnet1, SVM1=fit.svm1, CART1=fit.cart1, KNN1=fit.knn1))
summary(results)
dotplot(results)
In [57]:
# correlation between results
modelCor(results)
splom(results)
Are high correlations among features preventing a better prediction?
In [58]:
# remove correlated attributes
# find attributes that are highly correlated
set.seed(7)
cutoff <- 0.70
correlations <- cor(dataset[,1:13])
highlyCorrelated <- findCorrelation(correlations, cutoff=cutoff)
for (value in highlyCorrelated) {
print(names(dataset)[value])
}
# create a new dataset without the highly correlated features
dataset_features <- dataset[,-highlyCorrelated]
dim(dataset_features)
Four features were dropped for being more than 70% correlated. Now I'll run the baseline algorithms again on the reduced dataset.
In [59]:
# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", number=10)
metric <- "RMSE"
# lm
set.seed(7)
fit.lm <- train(medv~., data=dataset_features, method="lm", metric=metric, preProc=c("center", "scale"), trControl=control)
# GLM
set.seed(7)
fit.glm <- train(medv~., data=dataset_features, method="glm", metric=metric, preProc=c("center", "scale"), trControl=control)
# GLMNET
set.seed(7)
fit.glmnet <- train(medv~., data=dataset_features, method="glmnet", metric=metric, preProc=c("center", "scale"), trControl=control)
# SVM
set.seed(7)
fit.svm <- train(medv~., data=dataset_features, method="svmRadial", metric=metric, preProc=c("center", "scale"), trControl=control)
# CART
set.seed(7)
grid <- expand.grid(.cp=c(0, 0.05, 0.1))
fit.cart <- train(medv~., data=dataset_features, method="rpart", metric=metric, tuneGrid=grid, preProc=c("center", "scale"), trControl=control)
# kNN
set.seed(7)
fit.knn <- train(medv~., data=dataset_features, method="knn", metric=metric, preProc=c("center", "scale"), trControl=control)
# Compare algorithms
feature_results <- resamples(list(LM=fit.lm, GLM=fit.glm, GLMNET=fit.glmnet, SVM=fit.svm, CART=fit.cart, KNN=fit.knn))
summary(feature_results)
dotplot(feature_results)
In [60]:
fit.svm
Removing highly-correlated predictors helped.
Now I'll tune the SVM algorithm.
The C parameter is the cost constraint used by the SVM. Learn more in the help for the ksvm function, ?ksvm. We can see from previous results that a C value of 1.0 is a good starting point, so let's design a grid search around it. There may be a small trend of decreasing RMSE with increasing C, so let's try all integer C values between 1 and 15. Another parameter that caret lets us tune is sigma. This is a smoothing parameter for the radial kernel. Good sigma values often start around 0.1, so we will try numbers below and above that.
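As a point of reference, here is roughly how those two parameters appear in a direct kernlab::ksvm() call on the reduced feature set; this is a sketch with placeholder values, not tuned results.
# illustrative sketch of the kernlab call that caret wraps for method = "svmRadial"
library(kernlab)
svm_direct <- ksvm(medv ~ ., data = dataset_features,
                   type   = "eps-svr",          # epsilon regression
                   kernel = "rbfdot",           # radial basis kernel
                   kpar   = list(sigma = 0.1),  # kernel smoothing parameter
                   C      = 1)                  # cost constraint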
In [73]:
# tune SVM sigma and C parameters
control <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "RMSE"
set.seed(7)
grid <- expand.grid(.sigma=c(0.025, 0.05, 0.1, 0.15), .C=seq(1, 15, by=1))
fit.svm <- train(formula, data=dataset, method="svmRadial", metric=metric, tuneGrid=grid, preProc=c("BoxCox"), trControl=control)
print(fit.svm)
plot(fit.svm)
We can see that the RMSE curves for the different sigma values flatten out as the C cost constraint grows. It looks like we might do well with a sigma of 0.05 and a C of 8, which gives a respectable RMSE of 2.977085. If we wanted to take this further, we could do finer tuning with additional grid searches, explore other parameters of the underlying ksvm() function, or, as already mentioned, run grid searches on the other nonlinear regression methods.
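For example, a finer follow-up grid around sigma = 0.05 and C = 8 might look like the sketch below (fine_grid and fit.svm.fine are hypothetical names; this is not a run result).
# sketch only: a finer grid around the best values from the previous search
fine_grid <- expand.grid(.sigma = seq(0.04, 0.06, by = 0.005), .C = seq(6, 10, by = 1))
fit.svm.fine <- train(formula, data = dataset, method = "svmRadial", metric = metric,
                      tuneGrid = fine_grid, preProc = c("BoxCox"), trControl = control)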
In [74]:
set.seed(13)
predictions <- predict(fit.svm, newdata=validation, na.action=na.pass)
MLmetrics::RMSE(predictions, validation$medv)
In [63]:
glimpse(dataset_features)
In [68]:
# apply spatialSign
centerScale <- caret::preProcess(dataset_features[,1:9], method = c(
"center","scale",
"spatialSign" #needs to be scaled first
))
ssData <- predict(centerScale, newdata = dataset_features)
In [69]:
head(ssData)
In [71]:
# tune SVM sigma and C parameters
control <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "RMSE"
set.seed(7)
grid <- expand.grid(.sigma=c(0.025, 0.05, 0.1, 0.15, 0.2, 0.25), .C=seq(1, 15, by=1))
fit.svm <- train(formula, data=ssData, method="svmRadial", metric=metric, tuneGrid=grid, preProc=c("YeoJohnson"), trControl=control)
print(fit.svm)
plot(fit.svm)
In [72]:
set.seed(13)
predictions <- predict(fit.svm, newdata=validation, na.action=na.pass)
caret::RMSE(predictions, validation$medv)
MLmetrics::RMSE(predictions, validation$medv)